library(dplyr)
library(readr)
library(tibble)
library(tidyr)
library(purrr)
library(broom)
library(pheatmap)
library(plotly)
library(microbiome)
library(knitr)
Picture of obese mice
# load metadata
metadata <- read_tsv('Example/metadata.txt')
kable(metadata)
| X1 | Age | Air_temperature | Diet | Food_name | Country | Provider | Sex | Strain | Body_weight |
|---|---|---|---|---|---|---|---|---|---|
| ERR675518 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 25.62 |
| ERR675519 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 30.15 |
| ERR675520 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 26.61 |
| ERR675521 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 28.29 |
| ERR675522 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 26.49 |
| ERR675523 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 31.42 |
| ERR675524 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 27.70 |
| ERR675525 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 29.54 |
| ERR675526 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 27.48 |
| ERR675527 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 28.16 |
| ERR675528 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 30.08 |
| ERR675529 | 17 | 30 | chow | D12450B | Norway | Taconic in Denmark | male | C57/BL6 | 27.44 |
| ERR675530 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 34.15 |
| ERR675531 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 30.61 |
| ERR675532 | 17 | 30 | HF | D12492 | Norway | Taconic in Denmark | male | C57/BL6 | 30.84 |
We confirm that the mice on high fat diet really put more weight on.
ggplot(metadata, aes(x = Diet, y = Body_weight)) +
geom_point() +
theme_minimal()
# create a short label for each genome
Tax <- read_tsv('Example/Results/taxonomy.tsv') %>%
mutate(Labels = ifelse(is.na(species) & is.na(genus), paste0(family, " ", user_genome), species)) %>%
mutate(Labels = ifelse(is.na(Labels), paste0(genus, " ", user_genome), Labels))
For the relative abundance we take the coverage over the genome not the raw counts. This inmplicit normalizes for genome size. The coverage is calculated as the median of the coverage values calculated in 1kb blocks.
D <- read_tsv("Example/Results/counts/median_coverage_genomes.tsv") %>%
column_to_rownames(var = "X1")
# calculate relative abundance
rel_ab <- D/rowSums(D)
level <- 'family'
grouped_data <- rel_ab %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "user_genome") %>%
pivot_longer(cols = -user_genome, names_to = "Sample", values_to = "rel_ab") %>%
left_join(Tax, by = "user_genome") %>%
left_join(metadata, by = c("Sample" = "X1")) %>%
group_by(Sample, family, Diet) %>%
summarise(summarized_rel_ab = sum(rel_ab))
ggplot(grouped_data, aes(x = Sample, y = summarized_rel_ab, fill = family)) +
geom_col() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_blank(),
axis.title.y = element_blank()) +
facet_wrap(~Diet, scales = "free_x")
We see that the high fat diet induces a big change from the Muribaculaceae family (Bacteroidetes) to the Lachnospiraceae family.
In order to analyze the microbiome at the species or genome level we use compositional data analyis (CoDa), see more on Wikipedia and this article:
Gloor, Gregory B., Jean M. Macklaim, Vera Pawlowsky-Glahn, and Juan J. Egozcue. 2017. “Microbiome Datasets Are Compositional: And This Is Not Optional.” Frontiers in Microbiology 8 (November). Frontiers: 2224. doi: 10.3389/fmicb.2017.02224.
For this we load the rawcounts and use centric log ratios (clr) after imputing values for the zeros.
#load raw counts
Counts <- read_tsv('Example/Results/counts/raw_counts_genomes.tsv') %>%
column_to_rownames(var = "Sample") %>%
t()
# transforme counts with centrig log ratio
data <- transform(Counts, transform = "clr")
transformed_data <- prcomp(data)
pca_data <- transformed_data$x %>%
as.data.frame() %>%
rownames_to_column(var = "Sample") %>%
left_join(metadata, by = c("Sample" = "X1"))
ggplot(pca_data, aes(x = PC1, y = PC2, color = Diet)) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c(chow = "#00BFC4", HF = "#F8766D"))
As the counts are normalized in centered log ratio the log FC becomes the difference.
We use the welch test to assess differential abundance in the two groups. This is a simple version of aldex2. See Gloor et al for more information.
# mean abundance per group
Stats <- data %>%
as.data.frame() %>%
rownames_to_column(var = "Sample") %>%
pivot_longer(cols = -Sample, names_to = "Id", values_to = "clr") %>%
left_join(metadata, by = c("Sample" = "X1")) %>%
group_by(Diet, Id) %>%
summarize(mean_clr = mean(clr)) %>%
pivot_wider(id_cols = Id, names_from = Diet, values_from = mean_clr)
# calculate logFC
Stats <- Stats %>%
mutate(logFC = HF - chow)
# format data for t test
data_ttest <- data %>%
as.data.frame() %>%
rownames_to_column(var = "Sample") %>%
pivot_longer(cols = -Sample, names_to = "Id", values_to = "clr") %>%
left_join(metadata, by = c("Sample" = "X1"))
# run t test
data_ttest <- data_ttest %>%
group_by(Id, Diet) %>%
nest() %>%
spread(key = Diet, value = data) %>%
mutate(
t_test = map2(HF, chow, ~{t.test(.x$clr, .y$clr) %>% tidy()}),
HF = map(HF, nrow),
chow = map(chow, nrow)
) %>%
unnest() %>%
select(Id, p.value) %>%
mutate(Pvalue = p.value) %>%
mutate(logP = -log10(Pvalue))
Stats <- Stats %>%
left_join(data_ttest, by = "Id") %>%
left_join(Tax, by = c("Id" = "user_genome"))
Correcting form multiple testing would be even better
# filter to MAG abundances that were significantly different
sig_data <- data %>%
as.data.frame() %>%
select(Stats[Stats$Pvalue < 0.01, ]$Id) %>%
t()
# make a dataframe to use to annotate the heatmap
annot_df <- data.frame(Sample = colnames(sig_data)) %>%
left_join(metadata, by = c("Sample" = "X1")) %>%
column_to_rownames(var = "Sample") %>%
select(Diet)
# sort labels by sig_data order
heatmap_labels <- Tax %>%
filter(user_genome %in% rownames(sig_data))
heatmap_labels <- heatmap_labels[order(match(heatmap_labels$user_genome, rownames(sig_data))), ]
pheatmap(sig_data, cluster_rows = T, cluster_cols = T, annotation_col = annot_df,
labels_row = heatmap_labels$Labels)
# non interactive plot
ggplot(Stats, aes(x = logFC, y = logP, alpha = logP)) +
geom_point(color = "#67000d") +
theme_minimal()
plt <- ggplot(Stats, aes(x = logFC, y = logP, alpha = logP,
label = Labels, label2 = HF, label3 = chow)) +
geom_point(color = "#67000d") +
theme_minimal()
ggplotly(plt, tooltip = c("label", "label2", "label3"))
The uncultured species with the name ‘UBA7173 sp001689485’ is highly significantly increased in chow mice vs HF mice. It belongs to the Muribaculaceae family.
ggplot(data %>%
as.data.frame %>%
rownames_to_column(var = "Sample") %>%
left_join(metadata, by = c("Sample" = "X1")),
aes(y = MAG10, x = Diet, fill = Diet)) +
geom_boxplot() +
theme_minimal() +
scale_fill_manual(values = c(chow = "#00BFC4", HF = "#F8766D"))
kable(Tax %>%
filter(user_genome == 'MAG10'))
| user_genome | kindom | phylum | class | order | family | genus | species | Labels |
|---|---|---|---|---|---|---|---|---|
| MAG10 | Bacteria | Bacteroidota | Bacteroidia | Bacteroidales | Muribaculaceae | UBA7173 | UBA7173 sp001689485 | UBA7173 sp001689485 |